資料輸入與前處理


首先,我們讀入主要用來分析的資料集合。

# Required Library
library("tidyverse")
## ─ Attaching packages ───────────────────── tidyverse 1.2.1 ─
## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.6
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## Warning: package 'dplyr' was built under R version 3.5.1
## ─ Conflicts ─────────────────────── tidyverse_conflicts() ─
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Please set your own working directory
setwd("~/Google 雲端硬碟/[Hahow] R 語言和商業分析課程/Part 2 - 資料科學核心方法/2-3 PCA/Case")

# Read Dataset
financial.data <- read_csv("2017_financial index_163 comp.csv")
## Parsed with column specification:
## cols(
##   comp_id = col_integer(),
##   roe = col_double(),
##   roa = col_double(),
##   profit_margin_rate = col_double(),
##   gross_margin_rate = col_double(),
##   expense_rate = col_double(),
##   asset_turnover = col_double(),
##   inventory_turnover = col_double(),
##   equity_turnnover = col_double(),
##   rev_growth_rate = col_double(),
##   margin_growth_rate = col_double(),
##   op_profit_growth_rate = col_number(),
##   cash_reinv_rate = col_double(),
##   asset_growth_rate = col_double(),
##   current_ratio = col_number(),
##   quick_rartio = col_number(),
##   debt_ratio = col_double()
## )

讓我們來看看「財務資料集合」資料集合,你會發現所有比率的資料都是以 (%) 方式呈現。由於我們會「標準化」後再進行 PCA,所以此時不需要將這些比率除以 100。

head(financial.data, 5)
## # A tibble: 5 x 17
##   comp_id    roe    roa profit_margin_rate gross_margin_rate expense_rate
##     <int>  <dbl>  <dbl>              <dbl>             <dbl>        <dbl>
## 1    2303   3.06   2.21               4.4               18.1         14.8
## 2    2330  23.6   17.8               39.4               50.6         11.0
## 3    2337  25.7   14.3               16.8               37.0         20.1
## 4    2342  -3.41  -0.72               3.86              16.9         13  
## 5    2344  10.9    7.69              14.0               34.3         20.3
## # ... with 11 more variables: asset_turnover <dbl>,
## #   inventory_turnover <dbl>, equity_turnnover <dbl>,
## #   rev_growth_rate <dbl>, margin_growth_rate <dbl>,
## #   op_profit_growth_rate <dbl>, cash_reinv_rate <dbl>,
## #   asset_growth_rate <dbl>, current_ratio <dbl>, quick_rartio <dbl>,
## #   debt_ratio <dbl>

資料探索


讓我們先來看看每一個變數的敘述統計量。

summary(financial.data[, 2:ncol(financial.data)])
##       roe                 roa            profit_margin_rate
##  Min.   :-245.2000   Min.   :-132.5700   Min.   :-252.05   
##  1st Qu.:  -0.2825   1st Qu.:  -0.2225   1st Qu.:  -0.50   
##  Median :   7.9800   Median :   5.8200   Median :   7.72   
##  Mean   :   4.3588   Mean   :   4.1785   Mean   :   3.46   
##  3rd Qu.:  16.5400   3rd Qu.:  11.5100   3rd Qu.:  16.13   
##  Max.   :  37.0100   Max.   :  28.9300   Max.   :  51.67   
##  gross_margin_rate  expense_rate    asset_turnover  inventory_turnover
##  Min.   :-27.60    Min.   :  1.78   Min.   :0.030   Min.   : 0.000    
##  1st Qu.: 18.39    1st Qu.: 10.78   1st Qu.:0.530   1st Qu.: 3.092    
##  Median : 29.91    Median : 19.43   Median :0.720   Median : 4.370    
##  Mean   : 31.19    Mean   : 27.96   Mean   :0.878   Mean   : 6.227    
##  3rd Qu.: 41.48    3rd Qu.: 34.67   3rd Qu.:1.048   3rd Qu.: 6.475    
##  Max.   :100.00    Max.   :255.18   Max.   :5.110   Max.   :56.810    
##  equity_turnnover rev_growth_rate   margin_growth_rate
##  Min.   : 0.030   Min.   :-41.470   Min.   :-328.530  
##  1st Qu.: 0.745   1st Qu.: -4.827   1st Qu.:  -9.572  
##  Median : 1.035   Median :  5.745   Median :   8.200  
##  Mean   : 1.447   Mean   : 12.815   Mean   :  24.434  
##  3rd Qu.: 1.665   3rd Qu.: 17.852   3rd Qu.:  27.150  
##  Max.   :10.080   Max.   :573.570   Max.   : 722.800  
##  op_profit_growth_rate cash_reinv_rate   asset_growth_rate
##  Min.   :-449.33       Min.   :-62.240   Min.   :-52.860  
##  1st Qu.: -15.50       1st Qu.: -3.237   1st Qu.: -3.560  
##  Median :  15.76       Median :  2.620   Median :  4.240  
##  Mean   :  36.69       Mean   :  1.731   Mean   :  7.537  
##  3rd Qu.:  67.62       3rd Qu.:  8.582   3rd Qu.: 13.447  
##  Max.   :1708.73       Max.   : 35.170   Max.   :161.800  
##  current_ratio      quick_rartio       debt_ratio   
##  Min.   :  50.35   Min.   :  19.12   Min.   : 0.97  
##  1st Qu.: 187.37   1st Qu.: 130.28   1st Qu.:17.72  
##  Median : 297.32   Median : 206.93   Median :26.68  
##  Mean   : 394.05   Mean   : 300.61   Mean   :30.27  
##  3rd Qu.: 454.46   3rd Qu.: 328.61   3rd Qu.:43.47  
##  Max.   :3783.42   Max.   :3748.99   Max.   :94.24

為了瞭解變數間比次的相關程度,我們會利用 cor 函數計算變數彼此間的相關係數。

cor(financial.data[, 2:ncol(financial.data)])
##                                 roe          roa profit_margin_rate
## roe                    1.000000e+00  0.927613723        0.769836624
## roa                    9.276137e-01  1.000000000        0.849678033
## profit_margin_rate     7.698366e-01  0.849678033        1.000000000
## gross_margin_rate      2.639333e-01  0.316508192        0.276079736
## expense_rate          -5.639643e-01 -0.602167492       -0.768924244
## asset_turnover         1.479487e-01  0.166463783        0.116560159
## inventory_turnover     6.099483e-02  0.049871996        0.096369338
## equity_turnnover      -8.465143e-02  0.038691090        0.030993761
## rev_growth_rate        1.788903e-01  0.182344013        0.112338020
## margin_growth_rate     4.237200e-02  0.025256508        0.008797824
## op_profit_growth_rate  1.632351e-01  0.146719332        0.138618099
## cash_reinv_rate        5.184223e-01  0.504324078        0.477952503
## asset_growth_rate      3.667274e-01  0.388442882        0.300383311
## current_ratio          4.332876e-05 -0.006202091       -0.077164016
## quick_rartio           5.196865e-02  0.046996374       -0.015659771
## debt_ratio            -1.258534e-01 -0.053587292        0.034495869
##                       gross_margin_rate expense_rate asset_turnover
## roe                          0.26393332  -0.56396427     0.14794874
## roa                          0.31650819  -0.60216749     0.16646378
## profit_margin_rate           0.27607974  -0.76892424     0.11656016
## gross_margin_rate            1.00000000   0.39128488    -0.27028252
## expense_rate                 0.39128488   1.00000000    -0.29526650
## asset_turnover              -0.27028252  -0.29526650     1.00000000
## inventory_turnover          -0.31354281  -0.26958440     0.29470755
## equity_turnnover            -0.33231257  -0.24999003     0.81529132
## rev_growth_rate             -0.04901623  -0.14229687     0.64183724
## margin_growth_rate          -0.03352279  -0.02418444     0.09165550
## op_profit_growth_rate        0.12722264  -0.04348225     0.10178782
## cash_reinv_rate              0.13386636  -0.36882576    -0.02304687
## asset_growth_rate            0.19833667  -0.15952707     0.24830284
## current_ratio                0.47049269   0.37547646    -0.20018000
## quick_rartio                 0.49806646   0.33611349    -0.23187752
## debt_ratio                  -0.43050696  -0.30251126     0.25122647
##                       inventory_turnover equity_turnnover rev_growth_rate
## roe                          0.060994826      -0.08465143     0.178890323
## roa                          0.049871996       0.03869109     0.182344013
## profit_margin_rate           0.096369338       0.03099376     0.112338020
## gross_margin_rate           -0.313542810      -0.33231257    -0.049016229
## expense_rate                -0.269584397      -0.24999003    -0.142296865
## asset_turnover               0.294707548       0.81529132     0.641837236
## inventory_turnover           1.000000000       0.22341445     0.238834411
## equity_turnnover             0.223414455       1.00000000     0.352674052
## rev_growth_rate              0.238834411       0.35267405     1.000000000
## margin_growth_rate          -0.009975919       0.03092636     0.348369257
## op_profit_growth_rate       -0.070522242       0.04649760     0.277928273
## cash_reinv_rate              0.059059882      -0.02463604    -0.019240757
## asset_growth_rate            0.136004623       0.14819216     0.354016268
## current_ratio               -0.159251676      -0.29358895     0.032301834
## quick_rartio                -0.120757063      -0.29589570     0.008888558
## debt_ratio                   0.193927223       0.60197288    -0.006813672
##                       margin_growth_rate op_profit_growth_rate
## roe                          0.042371996          1.632351e-01
## roa                          0.025256508          1.467193e-01
## profit_margin_rate           0.008797824          1.386181e-01
## gross_margin_rate           -0.033522794          1.272226e-01
## expense_rate                -0.024184436         -4.348225e-02
## asset_turnover               0.091655503          1.017878e-01
## inventory_turnover          -0.009975919         -7.052224e-02
## equity_turnnover             0.030926357          4.649760e-02
## rev_growth_rate              0.348369257          2.779283e-01
## margin_growth_rate           1.000000000          3.771601e-01
## op_profit_growth_rate        0.377160119          1.000000e+00
## cash_reinv_rate              0.048577168          1.434912e-01
## asset_growth_rate            0.087707834          2.491875e-01
## current_ratio                0.001143093         -2.123743e-02
## quick_rartio                -0.019162942         -9.133092e-03
## debt_ratio                  -0.053638320         -4.886632e-05
##                       cash_reinv_rate asset_growth_rate current_ratio
## roe                        0.51842229        0.36672740  4.332876e-05
## roa                        0.50432408        0.38844288 -6.202091e-03
## profit_margin_rate         0.47795250        0.30038331 -7.716402e-02
## gross_margin_rate          0.13386636        0.19833667  4.704927e-01
## expense_rate              -0.36882576       -0.15952707  3.754765e-01
## asset_turnover            -0.02304687        0.24830284 -2.001800e-01
## inventory_turnover         0.05905988        0.13600462 -1.592517e-01
## equity_turnnover          -0.02463604        0.14819216 -2.935890e-01
## rev_growth_rate           -0.01924076        0.35401627  3.230183e-02
## margin_growth_rate         0.04857717        0.08770783  1.143093e-03
## op_profit_growth_rate      0.14349123        0.24918754 -2.123743e-02
## cash_reinv_rate            1.00000000        0.05643202 -2.392760e-02
## asset_growth_rate          0.05643202        1.00000000  4.641141e-02
## current_ratio             -0.02392760        0.04641141  1.000000e+00
## quick_rartio               0.03713851        0.06604236  9.753530e-01
## debt_ratio                 0.01151498        0.10378057 -5.861077e-01
##                       quick_rartio    debt_ratio
## roe                    0.051968653 -1.258534e-01
## roa                    0.046996374 -5.358729e-02
## profit_margin_rate    -0.015659771  3.449587e-02
## gross_margin_rate      0.498066464 -4.305070e-01
## expense_rate           0.336113493 -3.025113e-01
## asset_turnover        -0.231877517  2.512265e-01
## inventory_turnover    -0.120757063  1.939272e-01
## equity_turnnover      -0.295895701  6.019729e-01
## rev_growth_rate        0.008888558 -6.813672e-03
## margin_growth_rate    -0.019162942 -5.363832e-02
## op_profit_growth_rate -0.009133092 -4.886632e-05
## cash_reinv_rate        0.037138507  1.151498e-02
## asset_growth_rate      0.066042364  1.037806e-01
## current_ratio          0.975353040 -5.861077e-01
## quick_rartio           1.000000000 -5.246527e-01
## debt_ratio            -0.524652674  1.000000e+00

我通常喜歡使用熱圖 (heatmap) 視覺化變數間的相關程度,如果要使用 ggplot2 繪製相關係數的熱圖,必須先將資料整理成 tidy 的「變數 1 - 變數 2 - 相關係數」資料架構。我們可以利用 reshape2套件中的melt函數輕鬆把矩陣格式轉換成 tidy 資料。

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
head(melt(cor(financial.data[, 2:ncol(financial.data)])), 5)
##                 Var1 Var2      value
## 1                roe  roe  1.0000000
## 2                roa  roe  0.9276137
## 3 profit_margin_rate  roe  0.7698366
## 4  gross_margin_rate  roe  0.2639333
## 5       expense_rate  roe -0.5639643

接著我們就可以用geom_tile(繪製方形圖)繪製相關係數的熱圖囉!可以看到很清楚的變數群聚現象:

ggplot(melt(cor(financial.data[, 2:ncol(financial.data)])),
       aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient2(low = "firebrick4", high = "steelblue",
                       mid = "white", midpoint = 0) +
  guides(fill=guide_legend(title="Correlation")) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
        axis.title = element_blank())

資料建模與分析


首先,我們要建立 PCA 模型,可以利用 R 語言中的 prcomp 函數。其中,如果你希望輸入的資料矩陣先標準化在做參數估計,可以設定 scales = T

pca.model <- prcomp(financial.data[, 2:ncol(financial.data)], scale = T)

names(pca.model)
## [1] "sdev"     "rotation" "center"   "scale"    "x"

可以透過 summary 函數看到主成份分析每個主成份的解釋變異程度,可以發現只要六個主成分就能夠解釋 80% 的資料變異。

summary(pca.model)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6
## Standard deviation     2.0330 1.8739 1.4399 1.16215 1.01323 0.91896
## Proportion of Variance 0.2583 0.2195 0.1296 0.08441 0.06417 0.05278
## Cumulative Proportion  0.2583 0.4778 0.6074 0.69177 0.75594 0.80872
##                            PC7     PC8     PC9    PC10    PC11    PC12
## Standard deviation     0.87186 0.77645 0.73927 0.67853 0.56103 0.50353
## Proportion of Variance 0.04751 0.03768 0.03416 0.02878 0.01967 0.01585
## Cumulative Proportion  0.85623 0.89391 0.92806 0.95684 0.97651 0.99236
##                           PC13    PC14    PC15    PC16
## Standard deviation     0.27912 0.15178 0.12692 0.07223
## Proportion of Variance 0.00487 0.00144 0.00101 0.00033
## Cumulative Proportion  0.99723 0.99867 0.99967 1.00000

我們也可以透過解釋變異 / 累積解釋比率圖來選取主成份: - var:該主成份解釋變異數的數值 - prop:該主成份解釋變異數的比率 = PC 變異數 / 總變異 - cum_prop:該主成份解釋變異數的累積比率

由於 pca.model 只能夠抓出每一個 PC 的標準差 pca.model$sdev,所以我們需要先建立以下表格,計算出上述各項數值。

var.exp <- tibble(
  pc = paste0("PC_", formatC(1:16, width=2, flag="0")),
  var = pca.model$sdev^2,
  prop = (pca.model$sdev)^2 / sum((pca.model$sdev)^2),
  cum_prop = cumsum((pca.model$sdev)^2 / sum((pca.model$sdev)^2)))

head(var.exp, 5)
## # A tibble: 5 x 4
##   pc      var   prop cum_prop
##   <chr> <dbl>  <dbl>    <dbl>
## 1 PC_01  4.13 0.258     0.258
## 2 PC_02  3.51 0.219     0.478
## 3 PC_03  2.07 0.130     0.607
## 4 PC_04  1.35 0.0844    0.692
## 5 PC_05  1.03 0.0642    0.756

接著,我們可以利用 plotly 套件建立解釋變異的條狀圖。

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
plot_ly(
  x = var.exp$pc,
  y = var.exp$var,
  type = "bar"
) %>%
  layout(
    title = "Variance Explained by Each Principal Component",
    xaxis = list(type = 'Principal Component', tickangle = -60),
    yaxis = list(title = 'Variance'),
    margin = list(r = 30, t = 50, b = 70, l = 50)
  )
plot_ly(
  x = var.exp$pc,
  y = var.exp$cum_prop,
  type = "bar"
) %>%
  layout(
    title = "Cumulative Proportion by Each Principal Component",
    xaxis = list(type = 'Principal Component', tickangle = -60),
    yaxis = list(title = 'Proportion'),
    margin = list(r = 30, t = 50, b = 70, l = 50)
  )

從上述分析,建議取前六個主成份即可。接下來,讓我們更仔細的了解每一個主成份的係數,以便解釋主成份內容。如果想要找出主成份的係數矩陣,可以透過 pca.model$rotation

head(pca.model$rotation, 5)
##                            PC1         PC2         PC3         PC4
## roe                 0.35051324 -0.30782632  0.09992653 -0.01265022
## roa                 0.37209821 -0.29926633  0.09449066 -0.04995561
## profit_margin_rate  0.37420084 -0.25303809  0.17092884 -0.03880183
## gross_margin_rate  -0.07651638 -0.40212860 -0.08460058  0.03060840
## expense_rate       -0.40631496 -0.01939565 -0.21713947  0.05973301
##                            PC5         PC6         PC7         PC8
## roe                -0.01349820  0.03456689 -0.15906558  0.07416622
## roa                 0.08224210 -0.01771183 -0.12638004 -0.00498288
## profit_margin_rate  0.02278657 -0.01866967 -0.06950275 -0.16719939
## gross_margin_rate   0.40812461  0.01386915  0.01697671  0.32211008
## expense_rate        0.23611021  0.04571298  0.09960746  0.38295533
##                             PC9        PC10       PC11       PC12
## roe                -0.009130585  0.09361023 -0.2382325  0.5499299
## roa                -0.056233502 -0.08161125 -0.1409867  0.3340492
## profit_margin_rate  0.052479810 -0.34314001  0.2211807 -0.3876528
## gross_margin_rate  -0.272326633 -0.47017407  0.1271494 -0.1790633
## expense_rate       -0.239559563 -0.01790162 -0.1261054  0.2792875
##                           PC13         PC14        PC15        PC16
## roe                -0.37672698 -0.457679257 -0.13920077 -0.03555639
## roa                 0.58984636  0.480985247  0.11922953  0.05209979
## profit_margin_rate -0.12171515  0.004062131 -0.05631017 -0.62759258
## gross_margin_rate  -0.13036971 -0.066220940 -0.01451809  0.43415917
## expense_rate        0.04837154  0.096936721 -0.02701126 -0.63664808

在這裡,我們使用跟共變異數矩陣一樣的視覺化方法。

ggplot(melt(pca.model$rotation[, 1:6]), aes(Var2, Var1)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient2(low = "firebrick4", high = "steelblue",
                       mid = "white", midpoint = 0) +
  guides(fill=guide_legend(title="Coefficient")) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
        axis.title = element_blank())

非負稀疏主成份分析

我們發現上面的主成份其實很難解釋,所以改採用非負稀疏主成份分析,可以利用 nsprcomp 套件中的 nscumcomp 完成,其中有兩個重要的參數:

set.seed(1234)
library(nsprcomp)
nspca.model <- nscumcomp(
  financial.data[, 2:17], 
  k = 90, nneg = T,
  scale. = T)
var.exp <- tibble(
  pc = paste0("PC_", formatC(1:16, width=2, flag="0")),
  var = nspca.model$sdev^2,
  prop = (nspca.model$sdev)^2 / sum((nspca.model$sdev)^2),
  cum_prop = cumsum((nspca.model$sdev)^2 / sum((nspca.model$sdev)^2)))

head(var.exp, 5)
## # A tibble: 5 x 4
##   pc      var   prop cum_prop
##   <chr> <dbl>  <dbl>    <dbl>
## 1 PC_01 1.44  0.158     0.158
## 2 PC_02 1.19  0.132     0.290
## 3 PC_03 1.07  0.118     0.408
## 4 PC_04 0.906 0.0999    0.508
## 5 PC_05 0.888 0.0979    0.606

同樣,我們可以透過解釋變異與累積比率決定主成份個數,此處可以採用 8 個主成份。

library(plotly)

plot_ly(
  x = var.exp$pc,
  y = var.exp$var,
  type = "bar"
) %>%
  layout(
    title = "Variance Explained by Each Principal Component",
    xaxis = list(type = 'Principal Component', tickangle = -60),
    yaxis = list(title = 'Variance'),
    margin = list(r = 30, t = 50, b = 70, l = 50)
  )
plot_ly(
  x = var.exp$pc,
  y = var.exp$cum_prop,
  type = "bar"
) %>%
  layout(
    title = "Cumulative Proportion by Each Principal Component",
    xaxis = list(type = 'Principal Component', tickangle = -60),
    yaxis = list(title = 'Proportion'),
    margin = list(r = 30, t = 50, b = 70, l = 50)
  )

接下來,讓我們看看非負稀疏主成份的係數權重。從熱圖中可以看到:

ggplot(melt(nspca.model$rotation[, 1:8]), aes(Var2, Var1)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient2(low = "white", high = "steelblue") +
  guides(fill=guide_legend(title="Coefficient")) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
        axis.title = element_blank())

公司個別分析


在這裡教大家一個找出特別公司的方法,就是繪製「主成份分數」與「該主成份係數最大變數」的散佈圖。比如說下圖中,可以找出幾種特別怪異的公司:

nspca.score <- data.frame(nspca.model$x)
row.names(nspca.score) <- financial.data$comp_id

plot_ly(
  x = nspca.score[, 1],
  y = financial.data$roe,
  text = financial.data$comp_id,
  type = "scatter",
  mode = "markers"
) %>% layout(
    title = "ROE v.s. PC 1 Score: Scatter Plot",
    xaxis = list(title = 'Principal Component 1'),
    yaxis = list(title = 'Return on Equity'),
    margin = list(r = 30, t = 50, b = 70, l = 50)
  )

另外,透過不同主成份的散佈圖,也可以找到再多種面向都很傑出的公司:

plot_ly(
  x = nspca.score[, 2],
  y = nspca.score[, 3],
  text = financial.data$comp_id,
  type = "scatter",
  mode = "markers"
) %>% layout(
    title = "PC 2 v.s. PC 3 Score: Scatter Plot",
    xaxis = list(title = 'Principal Component 2'),
    yaxis = list(title = 'Principal Component 3'),
    margin = list(r = 30, t = 50, b = 70, l = 50)
  )